pip install category_encoders -q
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
# Feature extraction
from imblearn import under_sampling, over_sampling
from sklearn.preprocessing import MinMaxScaler, StandardScaler
Data set yang digunakan diambil dari: https://www.kaggle.com/anmolkumar/health-insurance-cross-sell-prediction
# Mount Google Drive (Colab) and load the Kaggle training data.
from google.colab import drive
drive.mount('/content/drive')
path = '/content/drive/My Drive/AstroBoys_Notebook/data/'
df_train = pd.read_csv(path + 'train.csv')
# Quick look at the first rows and the column dtypes / non-null counts.
df_train.head()
df_train.info()
Pada penggambaran informasi data di atas, terlihat bahwa terdapat total 12 kolom, 381.109 baris data, serta tidak ada data yang hilang. Dengan 3 kolom diantaranya bersifat kategoris (Gender, Vehicle_Age, Vehicle_Damage), dan 9 kolom sisanya bersifat numerik.
Definisi kolom:
df_train.isnull().sum().reset_index()
Pada tabel data di atas, terlihat bahwa dari semua 12 kolom yang tersedia, tidak ada satupun yang memiliki data Null (hilang).
Hanya 12.256336113815209 % dari semua populasi di mana response bernilai 1
df_train.describe()
Pada penggambaran data numerik di atas, terlihat bahwa secara umum, tidak ada data yang secara statistik terlihat aneh, terkecuali kolom Annual_Premium yang memiliki nilai maksimum yang sangat besar dan berbeda jauh dengan nilai minimum. Masukan datanya pun terlihat sama, yakni 381.109 masukan data.
df_train[['Gender','Vehicle_Age','Vehicle_Damage']].describe()
Pada penggambaran di atas, tampak bahwa data kategoris bersifat wajar. Dengan kolom Gender, memiliki 2 nilai unik, dengan modus yaitu kelompok data Male, dengan kemunculan sebanyak 206.089 kali. Di sisi lain, kolom Vehicle_Age, memiliki 3 nilai unik, dengan modus yaitu kelompok data 1-2 Year, dengan frekuensi nilai sebanyak 200.316. Pada kolom Vehicle_Damage, terlihat bahwa kolom tersebut memiliki 2 nilai unik, dengan modus yaitu kelompok data Yes, dengan kemunculan sebanyak 192.413 kali. Masukan datanya pun terlihat sama, yakni 381.109 masukan data.
# Boxplot of every numeric feature to eyeball outliers.
features1a = ['Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
              'Annual_Premium', 'Policy_Sales_Channel', 'Vintage', 'Response']
plt.figure(figsize=(12, 20))
for idx, feature in enumerate(features1a):
    plt.subplot(6, 9, idx + 1)
    sns.boxplot(y=df_train[feature], color='green', orient='v')
plt.tight_layout()
Dari grafik yang dihasilkan, terdapat Outliers yang cukup banyak untuk kolom Annual_Premium, dengan jumlah yang cukup besar dan jauh dari sebaran data lainnya.
# Violin plots of the same features: distribution shape instead of box summary.
plt.figure(figsize=(12, 20))
for pos, col_name in enumerate(features1a):
    plt.subplot(6, 9, pos + 1)
    sns.violinplot(y=df_train[col_name], color='blue', orient='v')
plt.tight_layout()
# Histogram + KDE grid for each numeric feature, n plots per row.
data_num1 = df_train[features1a]
k = len(data_num1.columns)   # number of features to plot
n = 3                        # plots per row
m = (k - 1) // n + 1         # number of rows (ceiling division)
fig, axes = plt.subplots(m, n, figsize=(n * 5, m * 3))
# BUG FIX: DataFrame.iteritems() was removed in pandas 2.0 — use items().
for i, (name, col) in enumerate(data_num1.items()):
    r, c = i // n, i % n
    ax = axes[r, c]
    col.hist(ax=ax, color='green')
    # Overlay the KDE on a secondary y-axis so both scales stay readable.
    ax2 = col.plot.kde(ax=ax, secondary_y=True, title=name, color='red')
    ax2.set_ylim(0)
fig.tight_layout()
Berdasarkan grafik di atas, terlihat bahwa distribusi Age bersifat Positively Skewed. Response pun juga cukup timpang
# Count plots for the three categorical columns.
features1b = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
plt.figure(figsize=(10, 4))
for slot, cat_col in enumerate(features1b, start=1):
    plt.subplot(1, 3, slot)
    sns.countplot(x=df_train[cat_col], color='green', orient='v')
plt.tight_layout()
Dari grafik itu, tampak bahwa Vehicle_Damage memiliki distribusi data yang cukup seimbang. Sementara itu, untuk Vehicle_Age cukup timpang, dengan kategori nilai >2 Years kalah jauh dengan yang lain.
# Pearson correlation heatmap over the numeric features.
corr_=df_train[features1a].corr()
plt.figure(figsize=(16,10))
sns.heatmap(corr_,annot=True,fmt=".2f",cmap="BuPu");
Dari grafik Heatmap tersebut, sejauh ini tidak tampak kolom yang memiliki korelasi kuat (nilai > 0.7)
# Pairwise scatter matrix of the numeric features.
# BUG FIX: seaborn renamed pairplot's `size` parameter to `height` in 0.9
# and later removed the old name.
sns.pairplot(df_train[features1a],
             diag_kind='kde',
             plot_kws={'alpha':0.6,'s':80,'edgecolor':'k','color':'green'},
             height=4);
plt.tight_layout()
# Same matrix, colored by the Response target.
sns.pairplot(df_train[features1a],
             diag_kind='kde',hue='Response',
             plot_kws={'alpha':0.6,'s':80,'edgecolor':'k','color':'green'},
             height=4);
plt.tight_layout()
# Annual_Premium vs Vehicle_Damage, split by Response.
# NOTE(review): sns.catplot is figure-level and ignores the ax= argument —
# these two calls likely open their own figures; confirm the intended layout.
fig, (ax1, ax2) = plt.subplots(1,2,figsize=(15,7))
g=sns.catplot(x='Vehicle_Damage',y='Annual_Premium',hue='Response',data=df_train,ax=ax1)
g=sns.catplot(x='Vehicle_Damage',y='Annual_Premium',hue='Response',kind='swarm',data=df_train,ax=ax2)
# Response counts and mean response rate by Gender.
fig,(ax1,ax2)=plt.subplots(nrows=1,ncols=2,figsize=(20,8))
g=sns.countplot('Gender',hue='Response',data=df_train,ax=ax1,palette='husl')
ax1.set_title('Response Rate by Gender')
g=sns.barplot(x='Gender',y='Response',data=df_train,ax=ax2)
ax2.set_title('Response Rate by Gender')
ax2.set_xlabel('Gender')
ax2.set_ylabel('Response Probability')
# Response counts and mean response rate by Vehicle_Age.
fig,(ax1,ax2)=plt.subplots(nrows=1,ncols=2,figsize=(20,8))
g=sns.countplot('Vehicle_Age',hue='Response',data=df_train,ax=ax1,palette='husl')
ax1.set_title('Response Rate by Vehicle Age')
g=sns.barplot(x='Vehicle_Age',y='Response',data=df_train,ax=ax2)
ax2.set_title('Response Rate by Vehicle Age')
ax2.set_xlabel('Vehicle_Age')
ax2.set_ylabel('Response Probability')
# Response counts and mean response rate by Vehicle_Damage.
fig,(ax1,ax2)=plt.subplots(nrows=1,ncols=2,figsize=(20,8))
g=sns.countplot('Vehicle_Damage',hue='Response',data=df_train,ax=ax1,palette='husl')
ax1.set_title('Response Rate by Vehicle Damage')
g=sns.barplot(x='Vehicle_Damage',y='Response',data=df_train,ax=ax2)
ax2.set_title('Response Rate by Vehicle Damage')
ax2.set_xlabel('Vehicle_Damage')
ax2.set_ylabel('Response Probability')
# Data-quality re-check: missing values and duplicated rows.
data_missing_value = df_train.isnull().sum().reset_index()
data_missing_value
df_train.isnull().sum()
df_train.duplicated().sum()
# Re-check outliers on the numeric columns (same views as before, navy color).
features = ['Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
            'Annual_Premium', 'Policy_Sales_Channel', 'Vintage', 'Response']
plt.figure(figsize=(12, 20))
for position, column in enumerate(features):
    plt.subplot(6, 9, position + 1)
    sns.boxplot(y=df_train[column], color='Navy', orient='v')
plt.tight_layout()
Dapat dilihat pada variabel Annual_Premium terdapat outlier yang sangat banyak
# Outlier inspection and IQR filtering for Annual_Premium.
# BUG FIX: work on a copy — the original aliased df_train, so in-place
# mutations further down would also modify the raw training frame.
data_tes = df_train.copy()
f, ax = plt.subplots(2, 2, figsize=(18, 15))
# Original distribution (NOTE: distplot is deprecated in recent seaborn).
g = sns.distplot(data_tes['Annual_Premium'], kde=True, ax=ax[0, 0])
ax[0, 0].set_title('Annual_Premium - Original')
ax[0, 0].set_xlabel('')
g = sns.boxplot(data_tes['Annual_Premium'], color='green', orient='h', ax=ax[0, 1])
ax[0, 1].set_title('Annual_Premium - Original')
ax[0, 1].set_xlabel('')
# Log transformation.
# BUG FIX: np.log1p already computes log(1 + x); the original passed x + 1,
# effectively plotting log(x + 2).
g = sns.distplot(np.log1p(data_tes['Annual_Premium']), kde=True, ax=ax[1, 0])
ax[1, 0].set_title('Annual_Premium - log transformation')
ax[1, 0].set_xlabel('')
g = sns.boxplot(np.log1p(data_tes['Annual_Premium']), color='green', orient='h', ax=ax[1, 1])
ax[1, 1].set_title('Annual_Premium - log transformation')
ax[1, 1].set_xlabel('')
# IQR rule: keep rows within [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
Q1 = data_tes['Annual_Premium'].quantile(0.25)
Q3 = data_tes['Annual_Premium'].quantile(0.75)
IQR = Q3 - Q1
low_limit = Q1 - (1.5 * IQR)
high_limit = Q3 + (1.5 * IQR)
filtered_entries = ((data_tes['Annual_Premium'] >= low_limit) & (data_tes['Annual_Premium'] <= high_limit))
data_tes = data_tes[filtered_entries]
data_tes.shape
# Boxplots after IQR filtering — the Annual_Premium outliers should be gone.
features11 = ['Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
              'Annual_Premium', 'Policy_Sales_Channel', 'Vintage', 'Response']
plt.figure(figsize=(12, 20))
for slot, column in enumerate(features11, start=1):
    plt.subplot(6, 9, slot)
    sns.boxplot(y=data_tes[column], color='Red', orient='v')
plt.tight_layout()
# Collapse Vehicle_Age into two buckets: '> 1 Year' vs '< 1 Year'.
# NOTE(review): df_merge is an alias of data_tes, not a copy — the
# np.where assignment below also mutates data_tes.
df_merge = data_tes
merged_value = ['> 2 Years', '1-2 Year']
df_merge['Vehicle_Age'] = np.where(df_merge['Vehicle_Age'].isin(merged_value), '> 1 Year', '< 1 Year')
# Compare shapes: df_merge is the outlier-filtered frame, df_train is raw.
print(df_merge.shape)
print(df_train.shape)
df_merge.head()
# Normalization / standardization helper.
def normalize_standardize(data, op = 'standardize'):
    """Scale a pandas Series and return it as an (n, 1) ndarray.

    Parameters
    ----------
    data : pd.Series
        Column to scale.
    op : str
        'standardize' -> zero mean / unit variance (StandardScaler);
        'normalize'   -> min-max scaling to [0, 1] (MinMaxScaler).

    Returns
    -------
    numpy.ndarray of shape (len(data), 1), or 0 for an unknown `op`
    (kept for backward compatibility with the original behaviour).
    """
    reshaped = data.values.reshape(len(data), 1)
    if (op == 'standardize'):
        return StandardScaler().fit_transform(reshaped)
    elif (op == 'normalize'):
        # BUG FIX: the original chained StandardScaler().MinMaxScaler(),
        # which raises AttributeError — MinMaxScaler is a separate class.
        return MinMaxScaler().fit_transform(reshaped)
    else:
        # BUG FIX: corrected the 'normalze' typo in the user-facing message.
        print("Operasi yang dimasukan bukan 'normalize' atau 'standardize'. Silakan coba lagi...")
        return 0
# Numeric columns to standardize (id, Driving_License, Previously_Insured and Response are left out).
numerical_column = ['Age', 'Annual_Premium', 'Policy_Sales_Channel','Vintage' ]
# Collect the string (object) columns.
object_column = list(df_train.select_dtypes(include = ['object']).columns)
# New name so we don't have to restart from scratch on error.
# NOTE(review): df_std is an alias of df_merge, not a copy.
df_std = df_merge
# Standardize each numeric column in place.
for feature in numerical_column:
df_std[feature] = normalize_standardize(df_std[feature], 'standardize')
# Show the data after standardization.
df_std
# New dataframe name for the encoding stage (alias of df_std, not a copy).
df_encoded = df_std
object_column = list(df_std.select_dtypes(include = ['object']).columns)
print("Shape before encoding:",df_encoded.shape)
print("Column to be encoded:",object_column)
# One Hot Encoding (drop_first avoids the dummy-variable trap).
for feature in object_column:
dummies = pd.get_dummies(df_encoded[feature], prefix=feature, drop_first = True)
# Append the dummy columns to the dataframe.
df_encoded = pd.concat([df_encoded, dummies], axis=1)
print("Shape after encoding:", df_encoded.shape)
# Drop the original (now encoded) object columns.
df_encoded = df_encoded.drop(object_column,axis= 1)
print("Shape after dropping column:", df_encoded.shape)
df_encoded.head()
# Change Region Code type to string so BinaryEncoder treats it as categorical.
df_encoded['Region_Code'] = df_encoded['Region_Code'].astype(str)
print('Ada',df_encoded['Region_Code'].value_counts().count(), 'unique value pada Region_Code yang perlu diencode')
# Encoding with Binary Encoder (log2(#categories) columns instead of one-hot).
rc_encoded = ce.BinaryEncoder().fit_transform(df_encoded['Region_Code'])
print('Jumlah kolom yang terbentuk dari proses encoding region code:', rc_encoded.shape[1])
df_encoded = pd.concat([df_encoded, rc_encoded], axis = 1)
# NOTE(review): dropping 'Region_Code_0' assumes the category_encoders
# version emits a redundant leading column — verify with the installed version.
df_encoded = df_encoded.drop(['Region_Code', 'Region_Code_0'], axis =1)
df_encoded.head()
# Class distribution before handling the imbalance.
sns.countplot(x = 'Response', data = df_encoded)
from imblearn import under_sampling, over_sampling
# Split features and target.
X = df_encoded.drop(['Response'],axis=1)
x_columns = list(X.columns)
y = df_encoded['Response']
# Random undersampling of the majority class (fixed seed for reproducibility).
X_under, y_under = under_sampling.RandomUnderSampler(random_state=42).fit_resample(X, y)
# Convert the resampled arrays back to DataFrames so they can be concatenated.
X_under = pd.DataFrame(X_under)
y_under = pd.DataFrame(y_under)
# Restore the column names (otherwise they become 0,1,2,3,...).
X_under.columns = x_columns
y_under = y_under.rename(columns = {0: 'Response'})
# Concatenate into a single balanced dataframe.
df_under = pd.concat([X_under,y_under], axis = 1)
# Drop the id column — it carries no predictive signal.
df_under = df_under.drop('id',axis =1)
df_under.head()
# Class distribution after undersampling (should now be balanced).
sns.countplot(x = 'Response', data = df_under)
print(df_under['Response'].value_counts())
print(df_under.shape)
# Final modeling dataframe (alias of df_under) and its correlation heatmap.
df_final = df_under
features = list(df_final.columns)
corr_= df_final[features].corr()
plt.figure(figsize=(16,10))
sns.heatmap(corr_, annot=True, fmt = ".2f", cmap = "BuPu")
Pada tahap awal, akan digunakan keseluruhan fiturnya terlebih dahulu (hanya dengan menghapus id)
Untuk tahap selanjutnya apabila performa model kurang baik, akan digunakan fitur demikian:
Dan membuang:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Remount Drive and reload the raw training data for the insight section.
from google.colab import drive
drive.mount('/content/drive')
path = '/content/drive/My Drive/AstroBoys_Notebook/data/'
df= pd.read_csv(path + 'train.csv')
df.head()
# Insight 1: response counts split by Vehicle_Damage, with count labels.
plt.figure(figsize=(10,8))
sns.countplot('Vehicle_Damage', hue ='Response', data = df)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel(xlabel = 'Is owner vehicle has even damaged?',fontsize=15)
plt.ylabel(ylabel = 'Number of People',fontsize=15)
plt.legend(title = 'Get insurance', labels = ['No','Yes'], fontsize = 12)
# Count of rows per (Vehicle_Damage, Response) combination for the labels.
df_insight_damage = df.groupby(['Vehicle_Damage','Response'])['id'].count().reset_index().rename(columns={'id' : 'count'})
res_insight_damage = list(df_insight_damage['Response'])
dam_insight_damage = list(df_insight_damage['Vehicle_Damage'])
count_insight_damage = list(df_insight_damage['count'])
# Place each count just above its bar (x offsets picked by hand).
for i in range(0,len(res_insight_damage)):
plt.text(x = (0 if dam_insight_damage[i] == 'Yes' else 1) + (-0.3 if res_insight_damage[i]%2 == 0 else 0.13)
, y = count_insight_damage[i] +3000
, s=str(count_insight_damage[i])
, fontsize=13
, fontweight='bold')
# Headline annotation for the figure.
plt.text(x =-0.8, y= 230000, s = 'People who had their car damaged most like take the insurance', fontweight = 'bold', fontsize = 18)
plt.text(x =-0.8, y= 220000, s = 'The bad experience that someone has with their vehicle will make people think ', fontsize = 14)
plt.text(x =-0.8, y= 210000, s = 'more about taking out insurance', fontsize = 14)
# Insight 2: top-5 regions by number of positive responses.
df_insight_region = df[df['Response'] == 1].groupby('Region_Code').count().reset_index().sort_values('Response', ascending = False).head(5)
df_insight_region['Region_Code'] = df_insight_region['Region_Code'].astype(int)
plt.figure(figsize = (10,8))
sns.barplot(x = 'Region_Code', y = 'Response', data =df_insight_region)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel(xlabel = 'Region Code',fontsize=15)
plt.ylabel(ylabel = 'Count',fontsize=15)
# df_insight_region
# Bar labels, ordered by region code to match the x-axis ordering.
cnt_insight_region = list(df_insight_region.sort_values('Region_Code')['Response'])
reg_insight_region = list(df_insight_region.sort_values('Region_Code')['Region_Code'])
for i in range(0,len(cnt_insight_region)):
plt.text(x = i -0.2
, y = cnt_insight_region[i] + 300
, s=str(cnt_insight_region[i])
, fontsize=13
, fontweight='bold')
# Headline annotation for the figure.
plt.text(x =-1, y= 22500, s = 'People who are in region code 28 tend to choose to use insurance', fontweight = 'bold', fontsize = 18)
plt.text(x =-1, y= 21500, s = 'Region 28 is the largest contributor to people using insurance ', fontsize = 14)
plt.text(x =2.5, y= 20000, s = '*data is taken from people who are',fontstyle = 'italic', fontsize = 12)
plt.text(x =2.9, y= 19300, s = 'confirmed to take insurance',fontstyle = 'italic', fontsize = 12)
# Top-10 regions by count for non-responders (0) and responders (1).
df_insight_region_0 = df[df['Response'] == 0].groupby('Region_Code').count().reset_index().sort_values('Response', ascending = False).head(10)
reg_insight_region_0 = list(df_insight_region_0.sort_values('Region_Code')['Region_Code'])
df_insight_region_1 = df[df['Response'] == 1].groupby('Region_Code').count().reset_index().sort_values('Response', ascending = False).head(10)
reg_insight_region_1 = list(df_insight_region_1.sort_values('Region_Code')['Region_Code'])
# Keep only the rows belonging to those top regions, then stack both groups.
df_car_damage_0 = df[(df['Response'] == 0) & (df['Region_Code'].isin(reg_insight_region_0))]
df_car_damage_1 = df[(df['Response'] == 1) & (df['Region_Code'].isin(reg_insight_region_1))]
df_merge_car_damage = pd.concat([df_car_damage_0,df_car_damage_1])
df_merge_car_damage['Region_Code'] = df_merge_car_damage['Region_Code'].astype(int)
plt.figure(figsize=(15,12))
sns.countplot(y ='Region_Code', hue = 'Response', data= df_merge_car_damage)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.ylabel(ylabel = 'Region Code',fontsize=15)
plt.xlabel(xlabel = 'Count',fontsize=15)
plt.grid()
# Per-region counts for each response class.
# NOTE(review): relies on value_counts().reset_index() producing columns
# named 'index' / 'Region_Code' — pandas 2.x renamed these; verify.
df_nol = df_merge_car_damage[df_merge_car_damage['Response'] == 0]['Region_Code'].value_counts().reset_index()
xlabel_reg_0 = list(df_nol['index'])
ylabel_reg_0 = list(df_nol['Region_Code'])
df_satu= df_merge_car_damage[df_merge_car_damage['Response'] == 1]['Region_Code'].value_counts().reset_index()
xlabel_reg_1 = list(df_satu['index'])
ylabel_reg_1 = list(df_satu['Region_Code'])
# Regions present in the merged frame, plus accumulators for the labels.
x_percentage = list(df_merge_car_damage.sort_values('Region_Code')['Region_Code'].unique())
y_percentage = []
y_real_value = []
print(x_percentage)
def search_list(items, x):
    """Linear search for `x`.

    Parameters
    ----------
    items : sequence
        Values to scan.
    x : object
        Value to look for.

    Returns
    -------
    tuple of (bool, int)
        (True, index) of the first match, or (False, -1) when absent.
    """
    # Renamed the first parameter: the original called it `list`,
    # shadowing the builtin. Callers in this notebook pass positionally.
    for idx, value in enumerate(items):
        if value == x:
            return True, idx
    return False, -1
# For every region, compute responders as a percentage of non-responders
# and keep the raw responder count for label placement.
for i in x_percentage:
res1, id1 = search_list(xlabel_reg_0, i)
res2, id2 = search_list(xlabel_reg_1, i)
if (res1 and res2):
y_percentage.append(100*(ylabel_reg_1[id2]/ylabel_reg_0[id1]))
y_real_value.append(ylabel_reg_1[id2])
else:
# Region missing from one of the groups: percentage defaults to 0.
y_percentage.append(0)
if res2:
y_real_value.append(ylabel_reg_1[id2])
else:
y_real_value.append(0)
# Attach the percentage labels next to the horizontal bars.
for i in range(0, len(x_percentage)):
plt.text(y = i + 0.3
, x = y_real_value[i] + 300
, s=str(round(y_percentage[i],2)) + '%'
, fontsize=12)
# Headline annotation for the figure.
plt.text(x =0, y= -1.3, s = 'Opportunities for people in region 28 to take part in the insurance program are 23.03%', fontweight = 'bold', fontsize = 18)
plt.text(x =0, y= -1, s = 'Region code 28 is the region that contributes the most to our vehicle ', fontsize = 14)
plt.text(x =0, y= -0.7, s = 'insurance customers, followed by region code 29', fontsize = 14)
# Insight 3: top-5 regions by number of damaged vehicles.
df_insight_region = df[df['Vehicle_Damage'] == 'Yes'].groupby('Region_Code').count().reset_index().sort_values('Vehicle_Damage', ascending = False).head(5)
df_insight_region
fig, ax = plt.subplots(figsize = (12,8))
sns.barplot(x = 'Region_Code', y = 'Vehicle_Damage', data =df_insight_region, ax = ax)
# NOTE(review): region-code -> state-name mapping is hard-coded here; confirm
# it matches the actual codes being plotted.
plt.xticks(ticks = [0,1,2,3,4], labels = ['Rajasthan', 'Mizoram', 'Tamil Nadu', 'Ladakh', 'Kerala'], fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel(xlabel = 'Region Name',fontsize=15)
plt.ylabel(ylabel = 'Count of Damaged Vehicle',fontsize=15)
# df_insight_region
# Bar labels, ordered by region code to match the x-axis ordering.
cnt_insight_region = list(df_insight_region.sort_values('Region_Code')['Response'])
reg_insight_region = list(df_insight_region.sort_values('Region_Code')['Region_Code'])
for i in range(0,len(cnt_insight_region)):
plt.text(x = i -0.3
, y = cnt_insight_region[i] + 300
, s=str(cnt_insight_region[i])
, fontsize=18
, fontweight='bold')
# Headline annotation plus a highlight box around the Tamil Nadu bar.
plt.text(x =-0.6, y= 80000, s = 'The increase in the number of insurance users in region 28', fontweight = 'bold', fontsize = 22)
plt.text(x =0.2, y= 76000, s = '(Tamil Nadu) was due to many damaged cars', fontweight = 'bold', fontsize = 22)
from matplotlib.patches import Rectangle
import matplotlib.patches as patches
ax.add_patch(Rectangle((1.45, 0), 1.1, 75000, fill=True, facecolor ='red', alpha=0.1))
ax.add_patch(Rectangle((1.45, 0), 1.1, 75000, fill=None, edgecolor='red', alpha=1, linestyle = '--', linewidth = 2))
# Insight 4: age distribution (KDE) for each response class.
# NOTE(review): sns.distplot is deprecated/removed in recent seaborn —
# kdeplot would be the modern equivalent.
fig, ax = plt.subplots(figsize = (12,8))
sns.distplot(df[df['Response'] == 0]['Age'],hist = False, kde_kws ={"lw" :3}, ax = ax)
sns.distplot(df[df['Response'] == 1]['Age'],hist = False, kde_kws ={"lw" :3}, ax = ax)
plt.legend(title = 'Response', labels = ['No', 'Yes'], fontsize = 12)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel(xlabel = 'Age',fontsize=15)
plt.ylabel(ylabel = 'PDF',fontsize=15)
# Highlight the 30-62 age band with a dashed red rectangle.
from matplotlib.patches import Rectangle
import matplotlib.patches as patches
ax.add_patch(Rectangle((30, 0), 32, 0.07, fill=True, facecolor ='red', alpha=0.1))
ax.add_patch(Rectangle((30, 0), 32, 0.07, fill=None, edgecolor='red', alpha=1, linestyle = '--', linewidth = 2))
plt.text(x =20, y= 0.073, s = 'The age range of 30-62 years is the age range in which ', fontweight = 'bold', fontsize = 18)
plt.text(x =35, y= 0.07, s = 'it is possible to take out insurance', fontweight = 'bold', fontsize = 18)
# Count of responders at the single most frequent age (used in the annotation).
countmax_age = df_train[df_train['Response'] == 1].groupby('Age').count().reset_index().sort_values('id', ascending = False).iloc[0,1]
ax.annotate('Maximum at age 44', xy=(44, 0.038), xytext=(62, 0.05),
arrowprops=dict(facecolor='black', shrink=0.05), fontsize = 18)
plt.text(x =65, y= 0.047, s = 'with ' + str(countmax_age) + ' people', fontsize = 18)
# Insight 5: overall class balance of the target, with count and % labels.
fig, ax = plt.subplots(figsize=(12,8))
sns.countplot('Response', data = df_train, ax = ax)
plt.xticks(ticks = [0, 1], labels = ['No','Yes'], fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel(xlabel = 'Response',fontsize=15)
plt.ylabel(ylabel = 'Count',fontsize=15)
percentage_response = []
count = []
# NOTE(review): relies on value_counts().reset_index() keeping the counts in
# a column named 'Response' — pandas 2.x names that column 'count'; verify.
count.append(df['Response'].value_counts().reset_index()['Response'][0])
count.append(df['Response'].value_counts().reset_index()['Response'][1])
total = sum(count)
percentage_response.append(count[0]/(count[0]+count[1]) * 100)
percentage_response.append(count[1]/(count[0]+count[1]) * 100)
# Label each bar with its absolute count (inside) and percentage (above).
for i in range(0,len(percentage_response)):
plt.text(x = i - 0.10
, y = count[i] - 20000
, s=str(round(count[i]))
, fontsize=18
, fontweight='bold'
, color = 'white')
plt.text(x = i - 0.15
, y = count[i] + 3000
, s=str(round(percentage_response[i], 4)) + '%'
, fontsize=22
, fontweight='bold')
# Headline annotation and export of the figure.
plt.text(x =1.1, y= 335000, s = 'Total: ' + str(total), fontsize = 18)
plt.text(x =-0.6, y= 400000, s = 'There are only about 12.25% of people who are willing', fontweight = 'bold', fontsize = 24)
plt.text(x =-0.6, y= 375000, s = 'to take an offer of vehicle insurance', fontweight = 'bold', fontsize = 24)
plt.savefig('responsecount.png', bbox_inches = 'tight')
# Insight 6: response by vehicle age, annotated with the positive-response
# percentage within each age bucket.
fig, ax=plt.subplots(figsize=(12,8))
sns.countplot('Vehicle_Age',hue='Response',data=df_train ,ax=ax,palette='husl', order=["< 1 Year", "1-2 Year", "> 2 Years"])
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel(xlabel = 'Vehicle Age',fontsize=15)
plt.ylabel(ylabel = 'Count',fontsize=15)
plt.text(x =-0.9, y= 188000, s = 'In general, customers with older vehicles also prefer a interested', fontweight = 'bold', fontsize = 22)
plt.text(x =-0.9, y= 179000, s = 'response, compared to owners with younger vehicles', fontweight = 'bold', fontsize = 22)
plt.legend(title = 'Get insurance', labels = ['No','Yes'], fontsize = 12)
# Totals per bucket, reordered from the alphabetical groupby order
# ('1-2 Year', '< 1 Year', '> 2 Years') to the plotted display order.
df_total = df_train.groupby('Vehicle_Age').count().reset_index()
total = []
total.append(df_total.iloc[1,1])
total.append(df_total.iloc[0,1])
total.append(df_total.iloc[2,1])
# Positive responses per bucket, reordered the same way.
df_year = df_train[df_train['Response'] == 1].groupby('Vehicle_Age').count().reset_index()
percentage_year = []
percentage_year.append(df_year.iloc[1,1]*100/total[0])
percentage_year.append(df_year.iloc[0,1]*100/total[1])
percentage_year.append(df_year.iloc[2,1]*100/total[2])
count = []
count.append(df_year.iloc[1,1])
count.append(df_year.iloc[0,1])
count.append(df_year.iloc[2,1])
# Percentage labels above the positive-response bars.
for i in range(0,len(percentage_year)):
plt.text(x = i
, y = count[i] + 3000
, s=str(round(percentage_year[i],4)) + '%'
, fontsize=20
, fontweight='bold')
Insight: Mayoritas Pelanggan yang memiliki kendaraan dengan umur lebih tua (1-2 tahun & >2 tahun) tampak lebih responsif dengan penawaran Asuransi Kendaraan. Secara umum, pelanggan dengan kendaraan tua juga lebih memilih respon 'tertarik', dibandingkan dengan kendaraan muda
Insights & Tips Summary:
# Results table that collects one row of metrics per model trained below.
df_results = pd.DataFrame(columns = ['Method', 'Precision', 'Recall', 'AUC'])
df_results
# Ranked views of the results (empty until model rows are appended).
df_results.sort_values('AUC', ascending = False)
df_results.sort_values('Precision', ascending = False)
df_results.sort_values('Recall', ascending = False)
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform

# Hyperparameter search space for logistic regression.
penalty = ['l2']
C = [0.2,0.22,0.24, 0.26, 0.28, 0.3, 0.32, 0.36]
# Collect into a dict for RandomizedSearchCV.
hyperparameters = dict(penalty=penalty, C=C)
classifier = LogisticRegression(random_state = 42)
# BUG FIX: the original passed scoring='' (an empty string), which is not a
# valid scorer name; use ROC-AUC, consistent with the other model searches.
# NOTE(review): X_train/y_train are not defined anywhere in this file —
# the train/test split presumably lives in an unseen cell; confirm.
clf = RandomizedSearchCV(classifier, hyperparameters, cv = 5, random_state=42, scoring='roc_auc', verbose = 1, n_jobs=-1)
best_model = clf.fit(X_train, y_train)
print(best_model.best_estimator_)
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, precision_score, recall_score

# Evaluate the tuned logistic regression on the hold-out set.
# Predict once and reuse (the original re-ran predict for every metric).
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)
print('\nConfusion matrix')
print(confusion_matrix(y_test, y_pred))
print('\nPrecision')
print(precision_score(y_test, y_pred))
print('\nRecall')
print(recall_score(y_test, y_pred))
print('\nClassification report')
print(classification_report(y_test, y_pred)) # generate the precision, recall, f-1 score, num
# BUG FIX: DataFrame.append was removed in pandas 2.0 — use pd.concat.
row = pd.DataFrame([{'Method' : 'Logistic Regression',
                     'Precision' : precision_score(y_test, y_pred),
                     'Recall' : recall_score(y_test, y_pred),
                     'AUC' : roc_auc_score(y_test, y_pred_proba[:,1])}])
df_results = pd.concat([df_results, row], ignore_index = True)
from sklearn.metrics import roc_curve, auc, roc_auc_score

# ROC curve for the tuned logistic regression.
fpr, tpr, _ = roc_curve(y_test, best_model.predict_proba(X_test)[:,1])
plt.title('Logistic Regression')
# BUG FIX: the x axis of a ROC curve is the false-positive rate, which is
# NOT precision — the original label was misleading (TPR = recall is kept).
plt.xlabel('FPR')
plt.ylabel('TPR (Recall)')
plt.plot(fpr,tpr)
plt.plot((0,1), ls='dashed',color='black')
plt.show()
print ('Area under curve (AUC): ', auc(fpr,tpr))
print (roc_auc_score(y_test, best_model.predict_proba(X_test)[:,1]))
import pickle
# Save the tuned logistic-regression search object to disk.
filename = 'logistic.sav'
pickle.dump(best_model, open(filename, 'wb'))
# Reload it as a sanity check and show the winning estimator.
filename = 'logistic.sav'
best_model = pickle.load(open(filename, 'rb'))
best_model.best_estimator_
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import uniform
# Hyperparameter search space for KNN.
n_neighbors = [3, 5, 7, 9, 11, 13]
metric = ['euclidean', 'manhattan', 'minkowski']
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
# Collect into a dict for RandomizedSearchCV.
hyperparameters = dict(n_neighbors=n_neighbors, metric=metric, algorithm = algorithm)
classifier = KNeighborsClassifier()
# 5-fold randomized search optimizing ROC-AUC.
clf = RandomizedSearchCV(classifier, hyperparameters, cv = 5, random_state=42, scoring='roc_auc', verbose = 1, n_jobs = -1)
best_model = clf.fit(X_train, y_train)
print(best_model.best_estimator_)
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, precision_score, recall_score

# Evaluate the tuned KNN model on the hold-out set.
# Predict once and reuse (the original re-ran predict for every metric).
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)
print('\nConfusion matrix')
print(confusion_matrix(y_test, y_pred))
print('\nPrecision')
print(precision_score(y_test, y_pred))
print('\nRecall')
print(recall_score(y_test, y_pred))
print('\nClassification report')
print(classification_report(y_test, y_pred)) # generate the precision, recall, f-1 score, num
# BUG FIX: DataFrame.append was removed in pandas 2.0 — use pd.concat.
row = pd.DataFrame([{'Method' : 'KNN',
                     'Precision' : precision_score(y_test, y_pred),
                     'Recall' : recall_score(y_test, y_pred),
                     'AUC' : roc_auc_score(y_test, y_pred_proba[:,1])}])
df_results = pd.concat([df_results, row], ignore_index = True)
from sklearn.metrics import roc_curve, auc, roc_auc_score

# ROC curve for the tuned KNN model.
fpr, tpr, _ = roc_curve(y_test, best_model.predict_proba(X_test)[:,1])
plt.title('KNN')
# BUG FIX: the x axis of a ROC curve is the false-positive rate, which is
# NOT precision — the original label was misleading (TPR = recall is kept).
plt.xlabel('FPR')
plt.ylabel('TPR (Recall)')
plt.plot(fpr,tpr)
plt.plot((0,1), ls='dashed',color='black')
plt.show()
print ('Area under curve (AUC): ', auc(fpr,tpr))
print (roc_auc_score(y_test, best_model.predict_proba(X_test)[:,1]))
import pickle
# Save the tuned KNN search object to disk.
filename = 'knn.sav'
pickle.dump(best_model, open(filename, 'wb'))
# Reload it as a sanity check and show the winning estimator.
filename = 'knn.sav'
best_model = pickle.load(open(filename, 'rb'))
best_model.best_estimator_
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, precision_score, recall_score

# Train Gaussian Naive Bayes (no hyperparameter search needed).
classifier = GaussianNB()
classifier.fit(X_train, y_train)
# BUG FIX: the original computed every metric from `best_model` — the stale
# search object left over from the previous (KNN) section — instead of the
# Naive Bayes classifier fitted here. All metrics now use `classifier`.
y_pred = classifier.predict(X_test)
y_pred_proba = classifier.predict_proba(X_test)
print('\nConfusion matrix')
print(confusion_matrix(y_test, y_pred))
print('\nPrecision')
print(precision_score(y_test, y_pred))
print('\nRecall')
print(recall_score(y_test, y_pred))
print('\nClassification report')
print(classification_report(y_test, y_pred)) # generate the precision, recall, f-1 score, num
# BUG FIX: DataFrame.append was removed in pandas 2.0 — use pd.concat.
row = pd.DataFrame([{'Method' : 'Naive Bayes',
                     'Precision' : precision_score(y_test, y_pred),
                     'Recall' : recall_score(y_test, y_pred),
                     'AUC' : roc_auc_score(y_test, y_pred_proba[:,1])}])
df_results = pd.concat([df_results, row], ignore_index = True)
from sklearn.metrics import roc_curve, auc, roc_auc_score

# ROC curve for Gaussian Naive Bayes.
# BUG FIX: score the Naive Bayes `classifier` fitted in this section — the
# original used the stale `best_model` from the previous model's search.
fpr, tpr, _ = roc_curve(y_test, classifier.predict_proba(X_test)[:,1])
plt.title('Naive Bayes')
# BUG FIX: the x axis of a ROC curve is the false-positive rate, which is
# NOT precision — the original label was misleading (TPR = recall is kept).
plt.xlabel('FPR')
plt.ylabel('TPR (Recall)')
plt.plot(fpr,tpr)
plt.plot((0,1), ls='dashed',color='black')
plt.show()
print ('Area under curve (AUC): ', auc(fpr,tpr))
print (roc_auc_score(y_test, classifier.predict_proba(X_test)[:,1]))
import pickle

# Persist the fitted Naive Bayes model, then reload it as a sanity check.
filename = 'naiveb.sav'
with open(filename, 'wb') as f:  # context managers: no leaked file handles
    pickle.dump(classifier, f)
with open(filename, 'rb') as f:
    best_model = pickle.load(f)
print("Done")
# BUG FIX: a plain GaussianNB has no `best_estimator_` attribute (only fitted
# search objects do), so the original line raised AttributeError; display the
# loaded estimator itself instead.
best_model
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
# Hyperparameter search space for the decision tree.
max_depth = [int(x) for x in np.linspace(1, 110, num = 30)] # Maximum number of levels in tree
min_samples_split = [2, 5, 10, 100] # Minimum number of samples required to split a node
min_samples_leaf = [1, 2, 4, 10, 20, 50] # Minimum number of samples required at each leaf node
# NOTE(review): max_features='auto' was deprecated/removed for tree models in
# newer scikit-learn — verify against the installed version.
max_features = ['auto', 'sqrt'] # Number of features to consider at every split
criterion= ['gini', 'entropy']
# Collect into a dict for RandomizedSearchCV.
hyperparameters = {
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'max_features': max_features,
'criterion' : criterion
}
classifier = DecisionTreeClassifier(random_state = 42)
# 15 random candidates, 5-fold CV, optimizing ROC-AUC.
clf = RandomizedSearchCV(classifier, hyperparameters, cv = 5, random_state=42, n_iter = 15, scoring='roc_auc', verbose = 1, n_jobs = -1)
best_model = clf.fit(X_train, y_train)
print(best_model.best_estimator_)
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, precision_score, recall_score

# Evaluate the tuned decision tree on the hold-out set.
# Predict once and reuse (the original re-ran predict for every metric).
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)
print('\nConfusion matrix')
print(confusion_matrix(y_test, y_pred))
print('\nPrecision')
print(precision_score(y_test, y_pred))
print('\nRecall')
print(recall_score(y_test, y_pred))
print('\nClassification report')
print(classification_report(y_test, y_pred)) # generate the precision, recall, f-1 score, num
# BUG FIX: DataFrame.append was removed in pandas 2.0 — use pd.concat.
row = pd.DataFrame([{'Method' : 'Decision Tree',
                     'Precision' : precision_score(y_test, y_pred),
                     'Recall' : recall_score(y_test, y_pred),
                     'AUC' : roc_auc_score(y_test, y_pred_proba[:,1])}])
df_results = pd.concat([df_results, row], ignore_index = True)
from sklearn.metrics import roc_curve, auc, roc_auc_score

# ROC curve for the tuned decision tree.
fpr, tpr, _ = roc_curve(y_test, best_model.predict_proba(X_test)[:,1])
# BUG FIX: the title said 'Logistic Regression' (copy-paste) — this section
# evaluates the decision tree.
plt.title('Decision Tree')
# BUG FIX: the x axis of a ROC curve is the false-positive rate, which is
# NOT precision — the original label was misleading (TPR = recall is kept).
plt.xlabel('FPR')
plt.ylabel('TPR (Recall)')
plt.plot(fpr,tpr)
plt.plot((0,1), ls='dashed',color='black')
plt.show()
print ('Area under curve (AUC): ', auc(fpr,tpr))
print (roc_auc_score(y_test, best_model.predict_proba(X_test)[:,1]))
# Top-10 feature importances from the best decision tree.
importance = best_model.best_estimator_.feature_importances_
# NOTE(review): the index is taken from the raw df columns, but the model was
# trained on the encoded/undersampled feature matrix — the names and lengths
# may not line up; x_columns would be the safer index. Verify.
feat_importances = pd.Series(importance, index= pd.Series(df.drop('Response', axis = 1).columns))
# feat_importances.plot(kind ="barh")
feat_importances.nlargest(10).plot(kind='barh')
plt.xlabel('score')
plt.ylabel('feature')
plt.title('feature importance score')
import pickle
# Save the tuned decision-tree search object to disk.
filename = 'dectree.sav'
pickle.dump(best_model, open(filename, 'wb'))
# Reload it as a sanity check and show the winning estimator.
filename = 'dectree.sav'
best_model = pickle.load(open(filename, 'rb'))
best_model.best_estimator_
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter search space for the random forest.
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)] # Number of trees in random forest
max_features = ['auto', 'sqrt', 'log2'] # Number of features to consider at every split
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)] # Maximum number of levels in tree
min_samples_split = [int(x) for x in np.linspace(start = 2, stop = 10, num = 5)] # Minimum number of samples required to split a node
min_samples_leaf = [int(x) for x in np.linspace(start = 1, stop = 10, num = 5)] # Minimum number of samples required at each leaf node
bootstrap = [True, False] # Method of selecting samples for training each tree
n_jobs = [-1]
# Collect into a dictionary for RandomizedSearchCV.
# BUG FIX: the original mapped 'min_samples_leaf' to the *split* candidate
# list and 'min_samples_split' to the *leaf* list; each key now gets its
# intended value list.
random_search = {'criterion': ['entropy','gini'],
                 'max_depth': max_depth,
                 'min_samples_leaf': min_samples_leaf,
                 'min_samples_split': min_samples_split,
                 'n_estimators': n_estimators,
                 'max_features' : max_features}
classifier = RandomForestClassifier(random_state = 42)
# 5-fold randomized search optimizing ROC-AUC.
clf = RandomizedSearchCV(classifier, random_search, cv = 5, random_state=42, scoring='roc_auc', verbose = 4, n_jobs = -1)
best_model = clf.fit(X_train, y_train)
print(best_model.best_estimator_)
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_auc_score, f1_score, precision_score,
                             recall_score, accuracy_score)

# Hold-out evaluation of the tuned random forest; reuse the predictions
# computed above instead of re-running the model for every metric.
print('\nConfusion matrix')
print(confusion_matrix(y_test, y_pred))
print('\nAccuracy')
print(accuracy_score(y_test, y_pred))
print('\nF1_Score')
print(f1_score(y_test, y_pred))
print('\nClassification report')
print(classification_report(y_test, y_pred))  # precision, recall, f-1 score, support
# Record the scores for the model-comparison table.
# BUG FIX: precision_score / recall_score were not imported at this point
# in the script (NameError), and DataFrame.append was removed in pandas
# 2.0 — pd.concat is the supported replacement.
df_results = pd.concat([df_results,
                        pd.DataFrame([{'Method': 'Random Forest',
                                       'Precision': precision_score(y_test, y_pred),
                                       'Recall': recall_score(y_test, y_pred),
                                       'AUC': roc_auc_score(y_test, y_pred_proba[:, 1])}])],
                       ignore_index=True)
from sklearn.metrics import roc_curve, auc

# ROC curve of the tuned random forest on the hold-out set.
# BUG FIX: the x-axis mislabelled FPR as "Precision" (FPR is not
# precision; only TPR equals recall), and the title carried a stray
# "CC Fraud" tag from an unrelated tutorial.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba[:, 1])
plt.title('Random Forest ROC curve')
plt.xlabel('FPR')
plt.ylabel('TPR (Recall)')
plt.plot(fpr, tpr)
plt.plot((0, 1), ls='dashed', color='black')  # chance diagonal
plt.show()
print('Area under curve (AUC): ', auc(fpr, tpr))
roc_auc_score(y_test, y_pred_proba[:, 1])
import pickle

# Persist and reload the tuned random-forest search object.
# BUG FIX: bare open(...) calls leaked file handles; the context managers
# close them deterministically.
filename = 'rforest1.sav'
with open(filename, 'wb') as f:
    pickle.dump(best_model, f)
with open(filename, 'rb') as f:
    best_model = pickle.load(f)
best_model.best_estimator_
# Top-10 feature importances from the best random forest.
columns = df.drop('Response', axis=1).columns
feat_importances = pd.Series(
    best_model.best_estimator_.feature_importances_,
    index=pd.Series(columns),
)
feat_importances.nlargest(10).plot(kind='barh')
plt.xlabel('score')
plt.ylabel('feature')
plt.title('feature importance score')
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from keras.metrics import AUC
from sklearn.model_selection import cross_val_score
def create_baseline(input_dim=15):
    """Build a minimal binary-classification MLP.

    One hidden ReLU layer feeding a single sigmoid output, compiled with
    binary cross-entropy, the Adam optimizer, and AUC as the tracked
    metric.

    Parameters
    ----------
    input_dim : int, default 15
        Number of input features (and hidden units, matching the
        original 15/15 layout). The default preserves the behaviour of
        the engineered feature set used in this notebook.
    """
    model = Sequential()
    model.add(Dense(input_dim, input_dim=input_dim, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[AUC()])
    return model
# 3-fold stratified cross-validation of the ANN.
# BUG FIX: the original loop ignored the fold indices returned by
# kfold.split and fit/evaluated on the full X_train / X_test every
# iteration, so all three "folds" were identical runs. Each fold now
# trains on its own partition and is scored on that fold's validation
# split.
cvscores = []
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
for train_idx, val_idx in kfold.split(X_train, y_train):
    model = create_baseline()
    history = model.fit(X_train[train_idx], y_train[train_idx],
                        epochs=3, batch_size=32, verbose=1,
                        validation_data=(X_train[val_idx], y_train[val_idx]))
    scores = model.evaluate(X_train[val_idx], y_train[val_idx], verbose=1)
    print("\n %s: %.2f%%\n---------\n" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
print("AUC Result for Testing")
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))
# Inspect the trained network, then turn the sigmoid probabilities for
# the hold-out set into hard 0/1 labels at the 0.5 threshold.
model.summary()
probabilities = model.predict(X_test, batch_size=32)
y_pred = np.where(probabilities >= 0.5, 1, 0)
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, precision_score, recall_score
# Evaluate the ANN on the hold-out set.
# BUG FIX: these metrics previously scored `best_model` — the earlier
# sklearn RandomizedSearchCV object — instead of the neural network.
# They now use the ANN's thresholded predictions `y_pred` from above.
print('\nConfusion matrix')
print(confusion_matrix(y_test, y_pred))
from sklearn.metrics import accuracy_score
print('\nPrecision')
print(precision_score(y_test, y_pred))
print('\nRecall')
print(recall_score(y_test, y_pred))
print('\nClassification report')
print(classification_report(y_test, y_pred))  # precision, recall, f-1 score, support
from sklearn.metrics import roc_curve, auc

# ROC curve for the ANN; column 0 of the sigmoid output holds P(class=1).
# BUG FIX: the title said "Random Forest ROC curve: CC Fraud" (copy-paste
# from another section/tutorial) and the x-axis mislabelled FPR as
# precision.
fpr, tpr, _ = roc_curve(y_test, model.predict(X_test, batch_size=32)[:,0])
plt.title('ANN ROC curve')
plt.xlabel('FPR')
plt.ylabel('TPR (Recall)')
plt.plot(fpr, tpr)
plt.plot((0, 1), ls='dashed', color='black')  # chance diagonal
plt.show()
print('Area under curve (AUC): ', auc(fpr, tpr))
import h5py

# Persist the ANN: architecture as JSON, weights as HDF5.
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("ANN.h5")
print("Saved model to disk")
# Record the ANN scores for the comparison table.
# BUG FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the
# supported replacement. AUC stores the mean CV AUC rescaled to [0, 1].
df_results = pd.concat([df_results,
                        pd.DataFrame([{'Method': 'ANN',
                                       'Precision': precision_score(y_test, y_pred),
                                       'Recall': recall_score(y_test, y_pred),
                                       'AUC': np.mean(cvscores) / 100}])],
                       ignore_index=True)
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
# XGBoost hyper-parameter search space.
# BUG FIX: the original listed both "learning_rate" and its XGBoost alias
# "eta"; they are the same parameter, so sampling both wasted search
# iterations on conflicting/ignored values. Only the canonical
# "learning_rate" is kept.
hyperparameters = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7],
}
# Randomized search over the XGBoost space (5-fold CV, ROC-AUC scoring).
classifier = xgb.XGBClassifier(random_state=42)
clf = RandomizedSearchCV(
    classifier, hyperparameters,
    cv=5, random_state=42, scoring='roc_auc', verbose=4, n_jobs=-1)
best_model = clf.fit(X_train, y_train)
print(best_model.best_estimator_)
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, precision_score, recall_score
# Hold-out evaluation of the tuned XGBoost model. y_pred was produced
# above from the same best_model, so it is reused for every metric.
print('\nConfusion matrix')
print(confusion_matrix(y_test, y_pred))
from sklearn.metrics import accuracy_score
print('\nPrecision')
print(precision_score(y_test, y_pred))
print('\nRecall')
print(recall_score(y_test, y_pred))
print('\nClassification report')
print(classification_report(y_test, y_pred))  # precision, recall, f-1 score, support
from sklearn.metrics import roc_curve, auc

# ROC curve for the tuned XGBoost model.
# BUG FIX: the title was copy-pasted from the random-forest section and
# the x-axis mislabelled FPR as precision.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba[:, 1])
plt.title('XGBoost ROC curve')
plt.xlabel('FPR')
plt.ylabel('TPR (Recall)')
plt.plot(fpr, tpr)
plt.plot((0, 1), ls='dashed', color='black')  # chance diagonal
plt.show()
print('Area under curve (AUC): ', auc(fpr, tpr))
# Record the scores. BUG FIX: DataFrame.append was removed in pandas 2.0;
# pd.concat replaces it, and the cached y_pred / y_pred_proba are reused.
df_results = pd.concat([df_results,
                        pd.DataFrame([{'Method': 'XGB',
                                       'Precision': precision_score(y_test, y_pred),
                                       'Recall': recall_score(y_test, y_pred),
                                       'AUC': roc_auc_score(y_test, y_pred_proba[:, 1])}])],
                       ignore_index=True)
# Plot the ten strongest XGBoost feature importances.
importance = best_model.best_estimator_.feature_importances_
names = pd.Series(df.drop('Response', axis=1).columns)
ranked = pd.Series(importance, index=names).nlargest(10)
ranked.plot(kind='barh')
plt.xlabel('score')
plt.ylabel('feature')
plt.title('feature importance score')
import pickle
import xgboost as xgb

# Persist and reload the tuned XGBoost search object.
# BUG FIX: bare open(...) calls leaked file handles; context managers
# close them.
filename = 'xgb.sav'
with open(filename, 'wb') as f:
    pickle.dump(best_model, f)
with open(filename, 'rb') as f:
    best_model = pickle.load(f)
best_model.best_estimator_
# NOTE(review): notebook shell magic — needs a leading "!" (or "%pip") to
# run inside Jupyter and is invalid as plain Python outside it; confirm
# the execution environment.
pip install xgboost --upgrade
from sklearn import tree
import matplotlib.pyplot as plt
from xgboost import plot_tree
# Render the first booster tree (num_trees=0) of the tuned model on a
# very large canvas so node labels stay legible.
fig, ax = plt.subplots(figsize=(80,80))
plot_tree(best_model.best_estimator_, ax = ax, num_trees = 0 )
# Re-inspect the engineered dataframe's schema and dtypes.
df.info()
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost search space: ensemble size and learning rate (0.1 .. 2.0 in
# steps of 0.1).
hyperparameters = {
    'n_estimators': [10, 50, 100, 500, 1000, 5000],
    'learning_rate': np.arange(0.1, 2.1, 0.1),
}
# Randomized search over the AdaBoost space (5-fold CV, ROC-AUC scoring).
classifier = AdaBoostClassifier(random_state=42)
clf = RandomizedSearchCV(
    classifier, hyperparameters,
    cv=5, random_state=42, scoring='roc_auc', verbose=4, n_jobs=-1)
best_model = clf.fit(X_train, y_train)
print(best_model.best_estimator_)
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, precision_score, recall_score
# Hold-out evaluation of the tuned AdaBoost model. best_model's
# predictions were already stored in y_pred above and are reused here.
print('\nConfusion matrix')
print(confusion_matrix(y_test, y_pred))
from sklearn.metrics import accuracy_score
print('\nPrecision')
print(precision_score(y_test, y_pred))
print('\nRecall')
print(recall_score(y_test, y_pred))
print('\nClassification report')
print(classification_report(y_test, y_pred))  # precision, recall, f-1 score, support
from sklearn.metrics import roc_curve, auc

# ROC curve for the tuned AdaBoost model.
# BUG FIX: the title was copy-pasted from the random-forest section and
# the x-axis mislabelled FPR as precision.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba[:, 1])
plt.title('AdaBoost ROC curve')
plt.xlabel('FPR')
plt.ylabel('TPR (Recall)')
plt.plot(fpr, tpr)
plt.plot((0, 1), ls='dashed', color='black')  # chance diagonal
plt.show()
print('Area under curve (AUC): ', auc(fpr, tpr))
# Ten most important features according to the best AdaBoost estimator.
scores = best_model.best_estimator_.feature_importances_
feat_importances = pd.Series(
    scores, index=pd.Series(df.drop('Response', axis=1).columns))
top = feat_importances.nlargest(10)
top.plot(kind='barh')
plt.xlabel('score')
plt.ylabel('feature')
plt.title('feature importance score')
# Record the AdaBoost scores.
# BUG FIX: DataFrame.append was removed in pandas 2.0; pd.concat replaces
# it, and the cached y_pred / y_pred_proba are reused instead of
# re-running the model.
df_results = pd.concat([df_results,
                        pd.DataFrame([{'Method': 'AdaBoost',
                                       'Precision': precision_score(y_test, y_pred),
                                       'Recall': recall_score(y_test, y_pred),
                                       'AUC': roc_auc_score(y_test, y_pred_proba[:, 1])}])],
                       ignore_index=True)
import pickle

# Persist the tuned AdaBoost search object.
# BUG FIX: the bare open(...) call leaked the file handle.
filename = 'ada.sav'
with open(filename, 'wb') as f:
    pickle.dump(best_model, f)
# Reload the stage-1 engineered dataset and rebuild the hold-out split.
train = pd.read_csv('data-stage1-31012021.csv')
# Separate the independent variables from the target.
X = train.drop('Response', axis=1).values
y = train['Response'].values
# Stratified 75/25 train/test split, reproducible via random_state.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y, shuffle=True)
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
# Per-fold ROC-AUC trackers: validation folds (roc_auc_list), the fixed
# hold-out set (roc_auc_holdout), and the training folds themselves
# (roc_auc_train, used to gauge overfitting).
roc_auc_list = []
roc_auc_holdout = []
roc_auc_train = []
folds = []
# Already tuned
# model = xgb.XGBClassifier(colsample_bytree=0.7, eta=0.2, gamma=0.4,
# learning_rate=0.05, max_depth=8,
# min_child_weight=5,
# n_estimators=100, n_jobs=-1)
# Final XGBoost configuration (presumably copied from the earlier
# randomized-search best estimator — TODO confirm).
# NOTE(review): eta=0.005 and learning_rate=0.1 are aliases of the same
# XGBoost parameter, so only one takes effect — confirm which value was
# intended. use_label_encoder is deprecated in recent xgboost releases.
model = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=0.7, eta=0.005, gamma=0.0,
gpu_id=-1, importance_type='gain', interaction_constraints='',
learning_rate=0.1, max_delta_step=0, max_depth=4,
min_child_weight=5, monotone_constraints='()',
n_estimators=100, n_jobs=4, num_parallel_tree=1,
objective='binary:logistic', random_state=42, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', use_label_encoder=True,
validate_parameters=1, verbosity=None)
# model = xgb.XGBClassifier()
# Stratified 10-fold splitter used by the cross-validation loop below.
kfold = StratifiedKFold(n_splits= 10, random_state = 42,shuffle =True)
# 10-fold stratified cross-validation of the configured XGBoost model.
# BUG FIX: kfold.split(X_train, y_train) yields positions *into X_train*,
# but the original indexed the full X / y with them — so training folds
# could contain hold-out rows (leakage into roc_auc_holdout) and the
# fold rows/labels were misaligned. Folds are now taken from X_train.
for i, (train_index, test_index) in enumerate(kfold.split(X_train, y_train)):
    X1_train, X1_valid = X_train[train_index], X_train[test_index]
    y1_train, y1_valid = y_train[train_index], y_train[test_index]
    model.fit(X1_train, y1_train)
    train_pred = model.predict_proba(X1_train)[:,1]  # fit quality on the training fold
    pred = model.predict_proba(X1_valid)[:,1]        # validation fold
    pred_holdout = model.predict_proba(X_test)[:,1]  # untouched hold-out set
    print('Prediction length on validation set, XGBoost Classifier, fold ', i, ': ', len(pred))
    folds.append(i)
    roc_auc_list.append(roc_auc_score(y1_valid, pred))
    roc_auc_holdout.append(roc_auc_score(y_test, pred_holdout))
    roc_auc_train.append(roc_auc_score(y1_train, train_pred))
# Summarize the CV run: mean/std of each ROC-AUC series, then plot the
# per-fold train / hold-out / validation curves with text annotations.
rg = np.arange(0.840, 0.870, 0.005)
train_mean, train_std = np.mean(roc_auc_train), np.std(roc_auc_train)
test_mean, test_std = np.mean(roc_auc_holdout), np.std(roc_auc_holdout)
val_mean, val_std = np.mean(roc_auc_list), np.std(roc_auc_list)
plt.style.use('tableau-colorblind10')
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(roc_auc_train, label='Train', marker='o', linestyle='-.')
ax.plot(roc_auc_holdout, label='Test', marker='o', linestyle=':')
ax.plot(roc_auc_list, label='Val', marker='o', linestyle='--')
# Mean annotation (right side of the axes).
text_m = (
    f"\n* Train Mean : {train_mean:.5f}"
    f"\n* Test Mean : {test_mean:.5f}"
    f"\n* Val Mean : {val_mean:.5f}\n"
)
ax.text(6, 0.841, text_m, horizontalalignment='left', color='black',
        fontsize=16, fontweight='normal')
# Standard-deviation annotation (left side of the axes).
text_s = (
    f"\n* Train Standard Deviation : {train_std:.5f}"
    f"\n* Test Standard Deviation : {test_std:.5f}"
    f"\n* Val Standard Deviation : {val_std:.5f}\n"
)
ax.text(0.5, 0.841, text_s, horizontalalignment='left', color='black',
        fontsize=16, fontweight='normal')
ax.set_xlabel('No of variable at each split', fontsize=18, labelpad=20)
ax.set_ylabel('ROC_AUC Score', fontsize=18, labelpad=10)
ax.set_title('XGBoost - Train, Test, Val Error', pad=20, fontsize=30)
ax.legend()
ax.set_yticks(rg)
sns.despine()
plt.savefig('./xgb-ttv.jpg')
plt.tight_layout()
plt.show()